#library(nCov2019)
library(leaflet)
library(dplyr)
library(ggplot2)
library(plotly)
library(scales)
library(xts)
library(dygraphs)
library(corrplot)
library(lubridate)
library(fmsb)
COVID<-read.csv("covid_19_data.csv")
COVID_2<-read.csv("COVID19_10-Apr.csv")
Format date:
Date<-as.Date(COVID_2$Date, format="%m/%d/%y")
COVID_2$Date2<-Date
COVID_updated<-COVID_2 %>% filter(Date2==max(Date2))
leaflet(width = "100%") %>%
addProviderTiles("CartoDB.DarkMatter") %>%
setView(lng = 0, lat = 10, zoom = 1.5) %>%
addCircleMarkers(data = COVID_updated,
lng = ~ Long,
lat = ~ Lat,
radius = ~ log(Confirmed+1),
color = rgb(218/255,65/255,56/255),
fillOpacity = ~ ifelse(Confirmed > 0, 1, 0),
stroke = FALSE,
label = ~ paste(Province.State,",",Country.Region, ": ", Confirmed)
)
Current top 10 countries:
COVID_top<-COVID_2 %>% filter(Date2==max(Date2)) %>%
group_by(Country.Region) %>% summarise(Total_confirmed=sum(Confirmed)) %>%
top_n(10,Total_confirmed) %>% arrange(desc(Total_confirmed))
plot<-ggplot(data=COVID_top
, aes(x=Total_confirmed,y=reorder(Country.Region,Total_confirmed))) +
geom_bar(stat ="identity",alpha=0.8,fill="firebrick3") +
geom_text(aes(label=Total_confirmed), vjust=0.5, hjust=0.9,color="black", size=3.5) +
scale_x_continuous(labels = comma) +
labs(title = paste("Top 10 countries with confirmed cases as of ",max(COVID_2$Date2)),
x = "Confirmed cases",
y = "Country") +
theme_minimal()
ggplotly(plot,tooltip = c("x"),width=750)
Time distribution:
COVID_2_Day<- COVID_2 %>% group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed),
World_deaths=sum(Deaths),
World_recovered=sum(Recovered))
COVID_Day_confirmed_series<-xts(COVID_2_Day$World_confirmed, order.by=COVID_2_Day$Date2)
COVID_Day_deaths_series<-xts(COVID_2_Day$World_deaths, order.by=COVID_2_Day$Date2)
COVID_Day_recovered_series<-xts(COVID_2_Day$World_recovered, order.by=COVID_2_Day$Date2)
Day_summary<-cbind(COVID_Day_confirmed_series,COVID_Day_deaths_series,COVID_Day_recovered_series)
dygraph(Day_summary, main = "SARS-COV2-outbreak: Total worldwide cases",
xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_Day_confirmed_series", "Total cases",drawPoints = TRUE,
pointSize = 3, color=rgb(53/255,116/255,199/255)) %>%
dySeries("COVID_Day_deaths_series", "Total deaths",drawPoints = TRUE,
pointSize = 3, color=rgb(189/255,55/255,48/255)) %>%
dySeries("COVID_Day_recovered_series", "Total recovered",drawPoints = TRUE,
pointSize = 3, color=rgb(69/255,136/255,51/255)) %>%
dyRangeSelector()
New_count<-function(x)
{
Daily_cases<-numeric(length(x))
for(i in length(x):2)
{
Daily_cases[i]=x[i] - x[i-1]
}
return(Daily_cases)
}
New_cases<-New_count(COVID_2_Day$World_confirmed)
New_deaths<-New_count(COVID_2_Day$World_deaths)
New_recovered<-New_count(COVID_2_Day$World_recovered)
COVID_New_confirmed_series<-xts(New_cases, order.by=COVID_2_Day$Date2)
COVID_New_deaths_series<-xts(New_deaths, order.by=COVID_2_Day$Date2)
COVID_New_recovered_series<-xts(New_recovered, order.by=COVID_2_Day$Date2)
New_summary<-cbind(COVID_New_confirmed_series,COVID_New_deaths_series,COVID_New_recovered_series)
dygraph(New_summary, main = "SARS-COV2-outbreak: Total worldwide cases",
xlab="Date", ylab="Novel coronavirus cases",width = 750) %>%
dySeries("COVID_New_confirmed_series", "New cases",drawPoints = TRUE,
pointSize = 3, color=rgb(53/255,116/255,199/255)) %>%
dySeries("COVID_New_deaths_series", "New deaths",drawPoints = TRUE,
pointSize = 3, color=rgb(189/255,55/255,48/255)) %>%
dySeries("COVID_New_recovered_series", "New recovered",drawPoints = TRUE,
pointSize = 3, color=rgb(69/255,136/255,51/255)) %>%
dyRangeSelector()
Team members countries total cases:
COVID_2_Day_Lebanon<- COVID_2 %>%
filter(Country.Region %in% c("Lebanon")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Chile<- COVID_2 %>%
filter(Country.Region %in% c("Chile")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Colombia<- COVID_2 %>%
filter(Country.Region %in% c("Colombia")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_CostaRica<- COVID_2 %>%
filter(Country.Region %in% c("Costa Rica")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_Day_series_Lebanon<-xts(COVID_2_Day_Lebanon$World_confirmed, order.by=COVID_2_Day_Lebanon$Date2)
COVID_Day_series_Chile<-xts(COVID_2_Day_Chile$World_confirmed, order.by=COVID_2_Day_Chile$Date2)
COVID_Day_series_Colombia<-xts(COVID_2_Day_Colombia$World_confirmed, order.by=COVID_2_Day_Colombia$Date2)
COVID_Day_series_CostaRica<-xts(COVID_2_Day_CostaRica$World_confirmed, order.by=COVID_2_Day_CostaRica$Date2)
Our_Countries<-cbind(COVID_Day_series_Lebanon,COVID_Day_series_Chile,COVID_Day_series_Colombia,COVID_Day_series_CostaRica)
dygraph(Our_Countries, main = "SARS-COV2-outbreak: Total cases by country", xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_Day_series_Lebanon", "Lebanon",drawPoints = TRUE,
pointSize = 3, color=rgb(0,0,3/255)) %>%
dySeries("COVID_Day_series_Chile", "Chile",drawPoints = TRUE,
pointSize = 3,color=rgb(120/255,28/255,109/255)) %>%
dySeries("COVID_Day_series_Colombia", "Colombia",drawPoints = TRUE,
pointSize = 3,color=rgb(237/255,105/255,37/255)) %>%
dySeries("COVID_Day_series_CostaRica", "Costa Rica",drawPoints = TRUE,
pointSize = 3,color=rgb(204/255,197/255,126/255)) %>%
dyRangeSelector()
New_Lebanon<-New_count(COVID_2_Day_Lebanon$World_confirmed)
New_Chile<-New_count(COVID_2_Day_Chile$World_confirmed)
New_Colombia<-New_count(COVID_2_Day_Colombia$World_confirmed)
New_CostaRica<-New_count(COVID_2_Day_CostaRica$World_confirmed)
COVID_New_series_Lebanon<-xts(New_Lebanon, order.by=COVID_2_Day_Lebanon$Date2)
COVID_New_series_Chile<-xts(New_Chile, order.by=COVID_2_Day_Chile$Date2)
COVID_New_series_Colombia<-xts(New_Colombia, order.by=COVID_2_Day_Colombia$Date2)
COVID_New_series_CostaRica<-xts(New_CostaRica, order.by=COVID_2_Day_CostaRica$Date2)
Our_New_Countries<-cbind(COVID_New_series_Lebanon,COVID_New_series_Chile,COVID_New_series_Colombia,COVID_New_series_CostaRica)
dygraph(Our_New_Countries, main = "SARS-COV2-outbreak: New cases by country", xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_New_series_Lebanon", "Lebanon",drawPoints = TRUE,
pointSize = 3, color=rgb(0,0,3/255)) %>%
dySeries("COVID_New_series_Chile", "Chile",drawPoints = TRUE,
pointSize = 3,color=rgb(120/255,28/255,109/255)) %>%
dySeries("COVID_New_series_Colombia", "Colombia",drawPoints = TRUE,
pointSize = 3,color=rgb(237/255,105/255,37/255)) %>%
dySeries("COVID_New_series_CostaRica", "Costa Rica",drawPoints = TRUE,
pointSize = 3,color=rgb(204/255,197/255,126/255)) %>%
dyRangeSelector()
fig <- plot_ly(COVID_updated, x = ~Confirmed, y = ~Deaths, z = ~Recovered, width=750) %>%
add_markers(text= ~Country.Region ,hoverinfo= "text",
marker = list(color=rgb(189/255,55/255,48/255))) %>%
layout(title="Confirmed cases Vs. Deaths Vs. Recovered", scene = list(
xaxis = list(title = 'Confirmed'),
yaxis = list(title = 'Deaths'),
zaxis = list(title = 'Recovered')))
fig
HDI<-read.csv("Human Development Index (HDI)_2.csv",sep=";",dec=",")
COVID_Country<-COVID_2 %>% filter(Date2==max(Date2)) %>%
group_by(Country.Region) %>% summarise(Total_confirmed=sum(Confirmed),
Total_deaths=sum(Deaths),
Total_Recovered=sum(Recovered))
Remove after parentheses:
HDI$Country_2<-gsub("\\s*\\([^\\)]+\\)","",as.character(HDI$Country))
HDI$Country_2[HDI$Country_2=="United States"]<-"US"
HDI$Country_2[HDI$Country_2=="Korea"]<-"South Korea"
Population:
Population<-read.csv("World_population.csv",sep=";",dec=",")
Remove after commma:
Population$Country_Name_2<-gsub(",.*", "", as.character(Population$Country_Name))
Population$Country_Name_2[Population$Country_Name_2=="United States"]<-"US"
Population$Country_Name_2[Population$Country_Code=="KOR"]<-"South Korea"
Population$Country_Name_2[Population$Country_Code=="CZE"]<-"Czechia"
Natural Join:
COVID_3<- COVID_Country %>% inner_join(HDI,by=c("Country.Region"="Country_2")) %>%
inner_join(Population,by=c("Country.Region"="Country_Name_2")) %>%
select(Country.Region,Total_confirmed,Total_deaths,Total_Recovered,HDI_Rank_2018,Year_2018,
Country_Code,Population_2018) %>%
mutate(Cases_million=(Total_confirmed/Population_2018)*1000000,
Recovered_percentage=(Total_Recovered/Total_confirmed)*100)
COVID_3<-COVID_3[!is.na(COVID_3$Population_2018),]
Plot the Human Development Index(HDI) Vs. the number of cases (applying a log transformation), and the proportion of recovered cases:
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Year_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(237/255,105/255,37/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="HDI Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="HDI")
ggplotly(plot,tooltip = c("text"),width=750)
COVID_numeric_1<-COVID_3 %>% mutate(Log_cases=log(Cases_million),
Death_percentage=(Total_deaths/Total_confirmed)*100) %>%
select(Log_cases,Recovered_percentage,Death_percentage,Year_2018)
corrplot(cor(COVID_numeric_1),method = "number",tl.col="black",tl.srt=15,
col=colorRampPalette(c(rgb(204/255,197/255,126/255),rgb(237/255,105/255,37/255)))(200))
Health_expenditure<-read.csv("Health_expenditure_GDP.csv",sep=";",dec=".")
Health_expenditure$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(Health_expenditure$Country))
Health_expenditure$Country_2[Health_expenditure$Country_2=="United States"]<-"US"
Health_expenditure$Country_2[Health_expenditure$Country_2=="Korea"]<-"South Korea"
COVID_3<- COVID_3 %>% inner_join(Health_expenditure,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
#write.csv(COVID_3,"COVID_Covariables.csv")
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Expenditure_2016,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(204/255,197/255,126/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="Health expenditure (% of GDP) Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="% of GDP in health")
ggplotly(plot,tooltip = c("text"),width=750)
Temperature<-read.csv("GlobalLandTemperaturesByCountry.csv",sep=";")
Date_Temp<-as.Date(Temperature$dt, format="%d/%m/%Y")
Temperature$Date_Temp<-Date_Temp
Extract month, filter IQ (Jan, Feb and Mar) and obtain the average IQ temperature:
Temperature<- Temperature %>% mutate(Month=month(Temperature$Date_Temp,label =TRUE),
Year=year(Temperature$Date_Temp)) %>%
filter(Month %in% c("Jan","Feb","Mar") & Year>=2000) %>%
group_by(Country) %>% summarise(Avg_IQ_temperature=mean(AverageTemperature))
Temperature<-Temperature[!is.na(Temperature$Avg_IQ_temperature),]
Temperature$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(Temperature$Country))
Temperature$Country_2[Temperature$Country_2=="United States"]<-"US"
Temperature$Country_2[Temperature$Country_2=="Korea"]<-"South Korea"
COVID_3<- COVID_3 %>% inner_join(Temperature,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
#write.csv(COVID_3,"COVID_Covariables.csv")
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Avg_IQ_temperature,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(120/255,28/255,109/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="Average IQ temperature Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="Temperature (Celcius)")
ggplotly(plot,tooltip = c("text"),width=750)
###DTP immunization
DTP_immunization<-read.csv("Infants_lacking_immunization_DTP.csv",sep=";")
Remove after parentheses:
DTP_immunization$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(DTP_immunization$Country))
DTP_immunization$Country_2[DTP_immunization$Country_2=="United States"]<-"US"
DTP_immunization$Country_2[DTP_immunization$Country_2=="Korea"]<-"South Korea"
COVID_4<- COVID_Country %>% inner_join(DTP_immunization,
by=c("Country.Region"="Country_2")) %>%
select(Country.Region,Total_confirmed,Total_deaths,Total_Recovered,
Lack_DTP_inmmunization_2018) %>%
mutate(Recovered_percentage=(Total_Recovered/Total_confirmed)*100,
Death_rate=(Total_deaths/Total_confirmed)*100)
plot<-ggplot(data=COVID_4,aes(x=Death_rate,y=Lack_DTP_inmmunization_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(204/255,197/255,126/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="% infants lacking DTP immunization Vs. death rate and \n proportion of recovered",
x="Novel coronavirus death rate",
y="% of infants")
ggplotly(plot,tooltip = c("text"),width=750)
Measles<-read.csv("Measles_immunization.csv",sep=";",dec=".")
Measles$Country_2<-gsub("\\s*\\([^\\)]+\\)","",as.character(Measles$Country))
Measles$Country_2[Measles$Country_2=="United States"]<-"US"
Measles$Country_2[Measles$Country_2=="Korea"]<-"South Korea"
COVID_4<- COVID_4 %>% inner_join(Measles,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
ggplot(COVID_4, aes(y=Measles_2018)) +
geom_boxplot(fill="dodgerblue4",outlier.shape = 21,
outlier.fill = "firebrick",alpha=0.75) +
ggtitle("Boxplot of % infants lacking measles immunization") + ylab("% of infants") +
theme_minimal()
ggplot(COVID_4, aes(Measles_2018)) +
geom_histogram(fill="dodgerblue4",bins=20,alpha=0.8) +
ggtitle("Histogram of % infants lacking measles immunization") +
xlab("% of infants") +
ylab("Count") +
theme_minimal()
plot<-ggplot(data=COVID_4,aes(x=Death_rate,y=Measles_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(237/255,105/255,37/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="% infants lacking measles immunization Vs. fatality rate \n and proportion of recovered",
x="Fatality rate (%)",
y="% of infants")
ggplotly(plot,tooltip = c("text"),width=750)
Transforming with ln and converting temperature as kelvin:
Mod1<-lm(log(COVID_3$Cases_million)~log(COVID_3$Year_2018)+log(COVID_3$Measles_2018)+
log(COVID_3$Expenditure_2016)+log(COVID_3$Avg_IQ_temperature+273.15))
summary(Mod1)
Call:
lm(formula = log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) +
log(COVID_3$Measles_2018) + log(COVID_3$Expenditure_2016) +
log(COVID_3$Avg_IQ_temperature + 273.15))
Residuals:
Min 1Q Median 3Q Max
-3.7927 -0.8026 -0.0988 0.7685 4.6646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 28.91233 18.71752 1.545 0.12462
log(COVID_3$Year_2018) 8.09219 0.65485 12.357 < 2e-16 ***
log(COVID_3$Measles_2018) 0.07961 0.11782 0.676 0.50030
log(COVID_3$Expenditure_2016) 1.00758 0.30867 3.264 0.00137 **
log(COVID_3$Avg_IQ_temperature + 273.15) -4.26198 3.29459 -1.294 0.19786
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.36 on 144 degrees of freedom
Multiple R-squared: 0.723, Adjusted R-squared: 0.7153
F-statistic: 93.96 on 4 and 144 DF, p-value: < 2.2e-16
Stepwise with AIC critertion:
Mod2<-step(Mod1,direction = "both")
Start: AIC=88.41
log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) + log(COVID_3$Measles_2018) +
log(COVID_3$Expenditure_2016) + log(COVID_3$Avg_IQ_temperature +
273.15)
Df Sum of Sq RSS AIC
- log(COVID_3$Measles_2018) 1 1.326 252.72 87.188
<none> 251.39 88.410
- log(COVID_3$Avg_IQ_temperature + 273.15) 1 3.627 255.02 88.530
- log(COVID_3$Expenditure_2016) 1 19.903 271.29 97.687
- log(COVID_3$Year_2018) 1 267.232 518.62 193.587
Step: AIC=87.19
log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) + log(COVID_3$Expenditure_2016) +
log(COVID_3$Avg_IQ_temperature + 273.15)
Df Sum of Sq RSS AIC
<none> 252.72 87.188
- log(COVID_3$Avg_IQ_temperature + 273.15) 1 3.451 256.17 87.196
+ log(COVID_3$Measles_2018) 1 1.326 251.39 88.410
- log(COVID_3$Expenditure_2016) 1 20.945 273.66 96.973
- log(COVID_3$Year_2018) 1 310.195 562.91 203.715
summary(Mod2)
Call:
lm(formula = log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) +
log(COVID_3$Expenditure_2016) + log(COVID_3$Avg_IQ_temperature +
273.15))
Residuals:
Min 1Q Median 3Q Max
-3.6304 -0.8232 -0.0390 0.8247 4.5290
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 30.1643 18.2214 1.655 0.100014
log(COVID_3$Year_2018) 7.6731 0.5772 13.295 < 2e-16 ***
log(COVID_3$Expenditure_2016) 1.0349 0.2996 3.455 0.000724 ***
log(COVID_3$Avg_IQ_temperature + 273.15) -4.4971 3.2069 -1.402 0.162977
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.325 on 144 degrees of freedom
Multiple R-squared: 0.7221, Adjusted R-squared: 0.7163
F-statistic: 124.7 on 3 and 144 DF, p-value: < 2.2e-16
Normality of residuals:
hist(Mod2$residuals)
shapiro.test(Mod2$residuals)
Shapiro-Wilk normality test
data: Mod2$residuals
W = 0.98758, p-value = 0.2094
Prediction power: separate between training and testing:
set.seed(179819)
Sample <- sample(1:length(COVID_3$Cases_million),length(COVID_3$Cases_million)*0.20)
t.testing<- COVID_3[Sample,]
t.training<- COVID_3[-Sample,]
Transform the training and testing variables as before:
t.training<-t.training %>% mutate(Cases_million_log=log(Cases_million),HDI_log=log(Year_2018),
GDP_log=log(Expenditure_2016),
Temperature_log_kelvin=log(Avg_IQ_temperature+273.15))
t.training<-t.training[,14:17]
t.testing<-t.testing %>% mutate(Cases_million_log=log(Cases_million),HDI_log=log(Year_2018),
GDP_log=log(Expenditure_2016),
Temperature_log_kelvin=log(Avg_IQ_temperature+273.15))
t.testing<-t.testing[,14:17]
Fit the same model with training
Mod3<-lm(Cases_million_log~., data=t.training)
summary(Mod3)
Call:
lm(formula = Cases_million_log ~ ., data = t.training)
Residuals:
Min 1Q Median 3Q Max
-3.4682 -0.7667 -0.0638 0.7399 4.3449
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.0811 18.5629 2.105 0.03744 *
HDI_log 7.1009 0.6266 11.332 < 2e-16 ***
GDP_log 1.0890 0.3331 3.270 0.00142 **
Temperature_log_kelvin -6.1107 3.2665 -1.871 0.06393 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.252 on 115 degrees of freedom
Multiple R-squared: 0.7275, Adjusted R-squared: 0.7204
F-statistic: 102.4 on 3 and 115 DF, p-value: < 2.2e-16
Error functions:
# Residual Sum of Square (RSS)
RSS<-function(Pred,Actual) {
ss<-sum((Actual-Pred)^2)
return(ss)
}
# Residual Standard Error (RSE)
RSE<-function(Pred,Real,NumPred) {
N<-length(Real)-NumPred-1
ss<-sqrt((1/N)*RSS(Pred,Real))
return(ss)
}
# Mean Squared Error
MSE <- function(Pred,Actual) {
N<-length(Actual)
ss<-(1/N)*RSS(Pred,Actual)
return(ss)
}
# Relative error
RelativeError<-function(Pred,Actual) {
ss<-sum(abs(Actual-Pred))/sum(abs(Actual))
return(ss)
}
Prediction:
Pred<-predict(Mod3,t.testing)
Errors:
RSS_Mod3<-RSS(Pred,t.testing$Cases_million_log)
RSE_Mod3<-RSE(Pred,t.testing$Cases_million_log,dim(t.testing)[2]-1)
MSE_Mod3<-MSE(Pred,t.testing$Cases_million_log)
RelativeError_Mod3<-RelativeError(Pred,t.testing$Cases_million_log)
Mod3_Errors<-c(RSS_Mod3,RSE_Mod3,MSE_Mod3,RelativeError_Mod3)
Now, a model without temperature:
t.training <- t.training %>% select(-Temperature_log_kelvin)
t.testing <- t.testing %>% select(-Temperature_log_kelvin)
Mod4<-lm(Cases_million_log~., data=t.training)
summary(Mod4)
Call:
lm(formula = Cases_million_log ~ ., data = t.training)
Residuals:
Min 1Q Median 3Q Max
-3.7017 -0.8447 0.0129 0.7149 4.3942
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.3797 0.7074 6.191 9.3e-09 ***
HDI_log 7.6548 0.5582 13.713 < 2e-16 ***
GDP_log 1.2422 0.3263 3.807 0.000227 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.266 on 116 degrees of freedom
Multiple R-squared: 0.7192, Adjusted R-squared: 0.7144
F-statistic: 148.6 on 2 and 116 DF, p-value: < 2.2e-16
Prediction:
Pred<-predict(Mod4,t.testing)
Errors:
RSS_Mod4<-RSS(Pred,t.testing$Cases_million_log)
RSE_Mod4<-RSE(Pred,t.testing$Cases_million_log,dim(t.testing)[2]-1)
MSE_Mod4<-MSE(Pred,t.testing$Cases_million_log)
RelativeError_Mod4<-RelativeError(Pred,t.testing$Cases_million_log)
Mod4_Errors<-c(RSS_Mod4,RSE_Mod4,MSE_Mod4,RelativeError_Mod4)
Create a radarplot:
Errors<-rbind(Mod3_Errors,Mod4_Errors)
rownames(Errors)<-c("Model with temperature","Model without temperature")
colnames(Errors)<-c("Residual Sum of Square","Residual Standard Error","Mean Squared Error","Relative error")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Error comparison")
legenda <-legend(1.5,1, legend=c("With temperature","Without temperature"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))